Prophage sequence similarity using ANI and MASH

Author

Lakhansing Pardeshi

Published

May 23, 2024

This script process ANI and MASH output for the filtered prophages and stores the distance matrices and clustering dendrograms.

Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(treeio))
suppressPackageStartupMessages(library(ggtree))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(corrplot))

# summarize prophage ANI and MASH results
# cluster prophages based on ANI distance

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/heatmap_utils.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
  conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
  dir = "."
)

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]

outDir <- confs$analysis$prophages$dir

Process ANI and MASH output and hierarchical clustering of the data

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
  df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId 
)

phages <- suppressMessages(
  readr::read_tsv(confs$analysis$prophages$preprocessing$files$filtered)
) %>% 
  dplyr::select(
    prophage_id, fragments, nFragments, prophage_length, nHg, genomeId
  )

nodeOfInterest <- dplyr::select(phages, prophage_id)

process ANI data for phages and store ANI and ANI-distance matrices

Code
aniDf <- suppressMessages(readr::read_tsv(
  file = confs$data$prophages$files$ani,
  col_names = c("id1", "id2", "ani", "mapped", "total")
)) %>%
  dplyr::mutate(
    dplyr::across(
      .cols = c(id1, id2),
      .fns = ~ stringr::str_replace(string = .x, pattern = ".*/(.*).fna", replacement = "\\1")
    ),
    dist = 1 - (ani / 100)
  )

aniDist <- dplyr::left_join(
  x = nodeOfInterest,
  y = tidyr::pivot_wider(
    data = aniDf,
    id_cols = "id1",
    names_from = "id2",
    values_from = "dist",
    values_fill = 1
  ),
  by = c("prophage_id" = "id1")
) %>%
  dplyr::select(prophage_id, all_of(nodeOfInterest$prophage_id))

readr::write_tsv(
  x = aniDist, file = confs$analysis$prophages$preprocessing$files$ani_dist
)

aniDistMat <- tibble::column_to_rownames(aniDist, var = "prophage_id") %>%
  as.matrix() %>%
  as.dist()

## store UPGMA and NJ trees
# plot(hclust(distMat))
aniUpgma <- ape::as.phylo(hclust(d = aniDistMat, method = "complete")) %>%
  ape::ladderize() %>%
  ape::makeNodeLabel(method = "number", prefix = "n")

ape::write.tree(
  phy = aniUpgma, tree.names = "ani_hclust",
  file = confs$analysis$prophages$preprocessing$files$ani_hclust
)

# plot(ape::root(phy = treeUpgma, outgroup = sampleInfoList[[outGroup]]$Genome, edgelabel = TRUE))
# nodelabels()

aniNj <- ape::nj(aniDistMat) %>%
  ape::ladderize() %>%
  ape::makeNodeLabel(method = "number", prefix = "n")

## set negative length edges => 0
aniNj$edge.length[aniNj$edge.length < 0] <- 0

ape::write.tree(
  phy = aniNj, tree.names = "ANI_NJ",
  file = confs$analysis$prophages$preprocessing$files$ani_nj
)

process MASH data and generate trees

Code
mashDf <- suppressMessages(readr::read_tsv(
  file = confs$data$prophages$files$mash,
  col_names = c("id1", "id2", "dist", "pvalue", "mapped")
)) %>%
  dplyr::mutate(
    dplyr::across(
      .cols = c(id1, id2),
      .fns = ~ stringr::str_replace(
        string = .x, pattern = ".*/(.*).fna", replacement = "\\1"
      )
    )
  )

# mashDf %>%
#   dplyr::filter(id1 %in% !!nodeOfInterest$id, id2 %in% !!nodeOfInterest$id) %>%
#   dplyr::filter(id1 != id2) %>%
#   dplyr::arrange(dist, id1, id2) %>%
#   dplyr::left_join(
#     y = dplyr::select(prophageDf, id1 = prophage_id, species1 = SpeciesName),
#     by = "id1"
#   ) %>%
#   dplyr::left_join(
#     y = dplyr::select(prophageDf, id2 = prophage_id, species2 = SpeciesName),
#     by = "id2"
#   ) %>%
#   clipr::write_clip()

mashDist <- dplyr::left_join(
  x = nodeOfInterest,
  y = tidyr::pivot_wider(
    data = mashDf,
    id_cols = "id1",
    names_from = "id2",
    values_from = "dist",
    values_fill = 1
  ),
  by = c("prophage_id" = "id1")
) %>%
  dplyr::select(prophage_id, all_of(nodeOfInterest$prophage_id))


readr::write_tsv(
  x = mashDist, file = confs$analysis$prophages$preprocessing$files$mash_dist
)

mashDistMat <- tibble::column_to_rownames(mashDist, var = "prophage_id") %>%
  as.matrix() %>%
  as.dist()

## store UPGMA and NJ trees
# plot(hclust(distMat))
mashUpgma <- ape::as.phylo(hclust(d = mashDistMat, method = "complete")) %>%
  ape::ladderize() %>%
  ape::makeNodeLabel(method = "number", prefix = "n")

ape::write.tree(
  phy = mashUpgma, tree.names = "mash_hclust",
  file = confs$analysis$prophages$preprocessing$files$mash_hclust
)


mashNj <- ape::nj(mashDistMat) %>%
  ape::ladderize() %>%
  ape::makeNodeLabel(method = "number", prefix = "n")

## set negative length edges => 0
mashNj$edge.length[mashNj$edge.length < 0] <- 0

ape::write.tree(
  phy = mashNj, tree.names = "mash_nj",
  file = confs$analysis$prophages$preprocessing$files$mash_nj
)

correlate MASH and ANI trees

Code
## cophenetic distance calculation for NJ and UPGMA tree
copheneticDist <- tibble::tibble(
  aniDist = as.vector(aniDistMat),
  aniUpgma = as.vector(as.dist(cophenetic(aniUpgma))),
  aniNj = as.vector(as.dist(cophenetic(aniNj))),
  mashDist = as.vector(mashDistMat),
  mashUpgma = as.vector(as.dist(cophenetic(mashUpgma))),
  mashNj = as.vector(as.dist(cophenetic(mashNj)))
)

M <- cor(copheneticDist)

corrplot::corrplot(
  M,
  type = "lower", tl.col = "black",
  cl.ratio = 0.2, tl.srt = 45, col = COL2("BrBG"), addCoef.col = "white"
)

Visualize distance matrices with clustering

Visualize MASH UPGMA tree

Code
treeTbl <- treeio::as_tibble(mashUpgma) %>%
  dplyr::left_join(y = phages, by = c("label" = "prophage_id")) %>%
  treeio::as.treedata()

pt_treeUpgma <- ggtree::ggtree(
  tr = treeTbl
) +
  ggtree::geom_nodelab(
    mapping = aes(label = label),
    node = "internal", size = 3, hjust = 1.3, color = "red"
  ) +
  ggtree::geom_tiplab(
    size = 2, align = TRUE, linesize = 0.5
  ) +
  scale_x_continuous(expand = expansion(mult = c(0.05, 0.5))) +
  ggnewscale::new_scale_color()

# ggsave(
#   plot = pt_treeUpgma, width = 8, height = 20, scale = 2,
#   filename = paste(outDir, "/prophages.MASH.UPGMA_tree.pdf", sep = "")
# )

Visualize ANI distances

Code
ht_ani <- plot_species_ANI_heatmap(
  mat = 100 * (1 - as.matrix(aniDistMat)),
  name = "ani",
  phy = aniUpgma, speciesInfo = NULL,
  col = circlize::colorRamp2(
    breaks = c(0, 50, 80, 90, 91, 92, 93, 93.5, 94, 95, 96, 97, 99),
    colors = viridisLite::viridis(n = 13, option = "B")
  ),
  heatmap_legend_param = list(
    direction = "horizontal", legend_width = unit(5, "cm")
  )
)

# pdf(
#   file = file.path(outDir, "prophage_ANI_heatmap.pdf"),
#   width = 10, height = 10
# )
# ComplexHeatmap::draw(
#   object = ht_ani,
#   main_heatmap = "ani",
#   row_dend_side = "left",
#   heatmap_legend_side = "bottom"
# )
# dev.off()

Visualize MASH distances

Code
ht_mash <- plot_species_ANI_heatmap(
  mat = as.matrix(mashDistMat),
  phy = mashUpgma, speciesInfo = NULL,
  name = "mash",
  col = circlize::colorRamp2(
    breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
    # breaks = seq(0, 1, length.out = 13),
    colors = viridisLite::viridis(n = 13, option = "B")
  ),
  heatmap_legend_param = list(
    direction = "horizontal", legend_width = unit(5, "cm")
  )
)

# pdf(
#   file = file.path(outDir, "prophage_MASH_heatmap.pdf"),
#   width = 10, height = 10
# )
# ComplexHeatmap::draw(
#   object = ht_mash,
#   main_heatmap = "mash",
#   row_dend_side = "left",
#   heatmap_legend_side = "bottom"
# )
# dev.off()